String Matching

- Navie String Matching
- Rabin Karp Algorithm
- Finite Automaton
- Knuth Morris Patt Algorithm

|Algorithm|Preprocessing|Matching|

Algorithm Preprocessing Matching
Naiv 0 O((nm+1)m)
Rabin Karp Θ(m) O((nm+1)m)
Finite Automaton O(mΣ) Θ(n)
Knuth Morris Patt Θ(m) Θ(n)
단순 스트링 매칭 알고리즘(Naive String Matching)
#include <iostream>
#include <string>
using namespace std;
int NaiveStringMatcher(string T, string P){
int n=T.length(), m=P.length();
for(int s=0; n-m+1; ++s){
if(T.substr(s, m)==P)
return s;
}
return -1;
}
int main(void){
string T="acaabc";
string P="aab";
int point=NaiveStringMatcher(T, P);
std::cout<<"Point: "<<point<<'\n';
if(point!=-1){
std::cout<<"substr: "<<T.substr(point, P.length())<<std::endl;
}
return 0;
}
Rabin Karp Algorithm
모든 문자열의 문자들을 기수 d(디지트)라고 생각하며, mod를 연속적으로 계산
#include <iostream>
#include <string>
#include <cmath>
using namespace std;
int RabinKarpMatcher(string T, string P, int d, int q){
int n=T.length(), m=P.length();
int h=static_cast<int>(pow(d, m-1))%q;
int p=0, t=0;
for(int i=0; i<m; ++i){
p=(d*p+static_cast<int>(P[i]))%q;
t=(d*t+static_cast<int>(T[i]))%q;
}
for(int s=0; s<n-m+1; ++s){
if(p==t){
if(P==T.substr(s, m)) return s;
}
if(s<n-m){
t=(d*(t-static_cast<int>(T[s])*h)+static_cast<int>(T[s+m]))%q;
t=t>0? t: t+q;
}
}
return -1;
}
int main(void){
string T="acaabc";
string P="aab";
int point=RabinKarpMatcher(T, P, 10, 13);
std::cout<<"Point: "<<point<<'\n';
if(point!=-1){
std::cout<<"substr: "<<T.substr(point, P.length())<<std::endl;
}
return 0;
}
Finite Automaton
#include <iostream>
#include <string>
#include <vector>
#include <map>
#include <algorithm>
#include <cmath>
using namespace std;
struct delta{
int n=0, m=0;
int** table;
map<char, int> alphaInd;
vector<char> Sigma;
delta(string T, string P){
this->n=P.length()+1;
for(int i=0; i<T.length(); ++i){
if(find(this->Sigma.begin(), this->Sigma.end(), T[i])==this->Sigma.end()){
this->Sigma.push_back(T[i]);
}
}
sort(Sigma.begin(), Sigma.end());
this->m=Sigma.size();
for(int i=0; i<this->m; ++i){
alphaInd[Sigma[i]]=i;
}
table=new int*[this->n];
for(int i=0; i<this->n; ++i) table[i]=new int[this->m];
}
int* address(int i, char chr){
int j=this->alphaInd.at(chr);
return &table[i][j];
}
};
bool suffix(string a, string b){
// assert b.length()>a.length()
if(a.length()>b.length()) return false;
for(int i=b.length()-a.length(), j=0; i<b.length(); ++i, j++){
if(a[j]!=b[i]) return false;
}
return true;
}
void ComputeTransitionFunction(string P, delta& table){
int m=P.length();
for(int q=0; q<m+1; ++q){
for(char a: table.Sigma){
int k=min(m+1, q+2);
do{
k--;
}while(!suffix(P.substr(0, k), P.substr(0, q)+a));
*table.address(q, a)=k;
}
}
}
int FiniteAutomatonMatcher(string T, int m, delta& table){
int n=T.length();
int q=0;
for(int i=0; i<n; ++i){
q=(*table.address(q, T[i]));
if(q==m) return i-m+1;
}
return -1;
}
int main(void){
string T="abababacaba";
string P="ababaca";
delta table(T, P);
ComputeTransitionFunction(P, table);
for(int i=0; i<table.n; ++i){
for(int j=0; j<table.m; ++j) std::cout<<table.table[i][j]<<' ';
std::cout<<'\n';
}
std::cout<<std::endl;
int point=FiniteAutomatonMatcher(T, P.length(), table);
std::cout<<"Point: "<<point<<'\n';
for(int i=0; i<P.length(); ++i){
std::cout<<T[point+i]<<' ';
}
std::cout<<std::endl;
return 0;
}
Knuth Morris Pratt Algorithm
Finite Automaton과 유사하지만, Prefix를 이용한 PI를 생성함(전처리)

KMP의 PI는 Finite Automaton의 delta의 역할을 대신함
#include <iostream>
#include <string>
#include <vector>
using namespace std;
vector<int> ComputePrefixFunction(string P){
int m=P.length();
vector<int> PI(m);
PI[0]=0;
int k=0;
for(int q=1; q<m; ++q){
while(k>0 && P[k]!=P[q]) k=PI[k];
if(P[k]==P[q]) k++;
PI[q]=k;
}
return PI;
}
int KMPMatcher(string T, string P){
int n=T.length(), m=P.length();
vector<int> PI=ComputePrefixFunction(P);
int q=0;
for(int i=0; i<n; ++i){
while(q>0 && P[q]!=T[i]) q=PI[q-1];
if(P[q]==T[i]) q++;
if(q==m) return i-m+1;
std::cout<<q<<std::endl;
}
return -1;
}
int main(void){
string T="bacbababaabcbab";
string P="ababaca";
int point=KMPMatcher(T, P);
std::cout<<"Point: "<<point<<std::endl;
return 0;
}
Boyer-Moore Algorithm
KMP와 유사하지만, 문자열을 비교할 때 첫 인덱스가 아닌 뒷 인덱스부터 비교함.

O(N+M)